#!/usr/bin/env python

# Copyright © 2012 marmuta <marmvta@gmail.com>
#
# This file is part of Onboard.
#
# Onboard is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# Onboard is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import sys
import pypredict
from pypredict import *


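# Usage:
#   multilevel [history...] prefix
# All arguments but the last are the history; the last one is the prefix
# to complete (pass "" for an empty prefix).
# e.g. multilevel www test ""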

def main():
    """Run a multi-level prediction demo on a small built-in corpus.

    The command line arguments form the context: the last argument is the
    prefix to complete and the arguments before it are the history.  With
    no arguments a single empty prefix is used.

    Three levels of predictions are generated and printed:
      1. single words completing the prefix,
      2. two-word phrases (each level-1 word followed by a predicted word),
      3. three-word phrases (each level-2 phrase followed by a predicted
         word).
    Each phrase is scored with the product of the probabilities of all of
    its words and the phrase lists are printed sorted by descending score.
    """
    model = CachedDynamicModel()

    training_text = u"""
        No, when I go to sea, I go as a simple sailor, right before the mast,
        plumb down into the forecastle, aloft there to the royal mast-head.
        True, they rather order me about some, and make me jump from spar to
        spar, like a grasshopper in a May meadow. And at first, this sort
        of thing is unpleasant enough. And more than all,
        if just previous to putting your hand into the tar-pot, you have been
        lording it as a country schoolmaster, making the tallest boys stand
        in awe of you. The transition is a keen one, I assure you, from a
        schoolmaster to a sailor, and requires a strong decoction of Seneca and
        the Stoics to enable you to grin and bear it. But even this wears off in
        time.
        www.test.com
        www.gnome.org
        """
    # tokenize_text() returns a (tokens, spans) pair; only the tokens are
    # needed for training.
    tokens, _ = tokenize_text(training_text)
    model.learn_tokens(tokens)

    # Fall back to a single empty prefix when no arguments were given.
    context = [unicode(w) for w in sys.argv[1:] or [u""]]
    model.recency_ratio = 1
    limit = 20   # maximum number of predictions requested per lookup

    # NOTE(review): the label says "10" although limit is 20; the string is
    # left untouched because it is part of the program's output.
    with timeit("predict (10)"):

        phrases2 = []  # second level lookup: [phrase, probability]
        phrases3 = []  # third level lookup:  [phrase, probability]

        # How the context evolves per level ("*" marks the prefix position):
        #   www test o*  ->  www test org *  ->  www test org bla *
        #   www t*       ->  www test *      ->  www test org *
        #   t*           ->  test *          ->  test org *
        choices = model.predictp(context, limit)
        for word, p in choices:
            # Replace the prefix with the completed word and start a new,
            # empty prefix behind it.
            context2 = context[:-1] + [word, u""]
            choices2 = model.predictp(context2, limit)

            phrases2.extend([word + " " + word2, p * p2]
                            for word2, p2 in choices2)

            for word2, p2 in choices2:
                context3 = context2[:-1] + [word2, u""]
                choices3 = model.predictp(context3, limit)

                # Chain rule: the phrase probability is the product of the
                # probabilities of all three words, including the first.
                phrases3.extend(
                    [word + " " + word2 + " " + word3, p * p2 * p3]
                    for word3, p3 in choices3)

    # Most probable phrases first.
    phrases2.sort(key=lambda x: x[1], reverse=True)
    phrases3.sort(key=lambda x: x[1], reverse=True)

    print_choices(model, context, choices)
    print_choices(model, context, phrases2)
    print_choices(model, context, phrases3)


def print_choices(model, context, choices, max_rows=20):
    """Print summary statistics and the leading rows of a prediction list.

    model    -- object providing an integer `order` attribute and a
                `get_ngram_count(ngram)` method; nothing else is used here.
    context  -- list of words; the last entry is the prefix, the entries
                before it (at most order-1 of them) are the history.  An
                empty context is treated like a single empty prefix.
    choices  -- list of (text, probability) pairs (tuples or lists).
    max_rows -- number of leading entries of `choices` printed in detail;
                n-gram counts are only looked up for these.

    For each printed row the probability, the counts of the n-grams
    ending in the row's text (longest n-gram first, down to the unigram)
    and the text itself are shown.

    NOTE(review): the row text is looked up as one token.  For the
    multi-word phrases passed in by main() the counts are therefore
    presumably always 0 -- TODO confirm.
    """
    context = context or [u""]
    n = min(model.order, len(context))
    # context[-0:-1] would return almost the whole context, hence the guard.
    history = context[-n:-1] if n else []
    prefix = context[-1]

    # Single-argument, parenthesised prints produce the same output as the
    # equivalent Python 2 print statements.
    print("")
    print("history: %s prefix '%s' " % (history, prefix))

    psum = sum(x[1] for x in choices)
    zero_count = sum(1 for x in choices if x[1] == 0)

    # ought to be 1.0 for the whole vocabulary
    print("Probability sum %f for %d results" % (psum, len(choices)))
    print("Words with zero probability:  %d" % zero_count)

    # Left-pad short contexts with empty words so that every looked-up
    # n-gram sequence has exactly model.order entries.
    padding = [u""] * max(model.order - len(context), 0)
    for x in choices[:max_rows]:
        ng = padding + history + [x[0]]
        counts = [model.get_ngram_count(ng[i:]) for i in range(model.order)]
        print("%10f " % x[1]
              + "".join("%8d " % c for c in counts)
              + "'%s'" % x[0])

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

